import ssl
import json
# Disable certificate verification for code that uses the stdlib's default
# HTTPS context (e.g. urllib). This relies on private `ssl` names.
# NOTE(review): `requests` builds its TLS context through urllib3, so this
# override presumably has no effect on `requests.get` calls — confirm whether
# it is still needed, since it weakens TLS for the whole process.
ssl._create_default_https_context = ssl._create_unverified_context

# Sample search URLs showing where the keyword sits in the path
# (empty keyword "+", then "python", then "java"):
#https://search.51job.com/list/010000,000000,0000,00,9,99,+,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=
#https://search.51job.com/list/010000,000000,0000,00,9,99,python,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=
#https://search.51job.com/list/010000,000000,0000,00,9,99,java,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=

# Paged search URL for the "python" keyword; fill the page number with
# `url % page`.
url = 'https://search.51job.com/list/010000,000000,0000,00,9,99,python,2,%d.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='

# Sample URLs showing where the page number sits in the path (pages 2, 3, 4):
#https://search.51job.com/list/010000,000000,0000,00,9,99,python,2,2.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=
#https://search.51job.com/list/010000,000000,0000,00,9,99,python,2,3.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=
#https://search.51job.com/list/010000,000000,0000,00,9,99,python,2,4.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=

# Browser-like request headers sent with every HTTP request.
header = {
    # A well-formed desktop Chrome User-Agent (no stray ".html" fragments).
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    # "br" is deliberately not advertised: brotli bodies are only decoded when
    # an optional brotli package is installed, and an undecoded body would
    # break the text/regex parsing downstream.
    'Accept-Encoding': 'gzip, deflate',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
}
import re
from urllib.parse import quote

import requests
from lxml import etree
# Search keywords to crawl.
skill = ['python', 'java', 'C++', 'C#', 'PHP']

# Search-result URL; placeholders are the percent-encoded keyword (%s) and the
# 1-based page number (%d).
SEARCH_URL_TEMPLATE = (
    'https://search.51job.com/list/010000,000000,0000,00,9,99,%s,2,%d.html'
    '?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99'
    '&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
)

# Number of result pages fetched per keyword.
# TODO: read the real page count from the first response instead.
PAGES_PER_SKILL = 8

# Seconds to wait for a response before giving up on a request.
REQUEST_TIMEOUT = 10

# The search page embeds its results as a JSON object assigned to
# window.__SEARCH_RESULT__ inside a <script> tag.
_SEARCH_RESULT_RE = re.compile(
    r'window\.__SEARCH_RESULT__ =(.*?)</script>', re.DOTALL
)


def build_search_url(keyword, page):
    """Return the search URL for *keyword* on result page *page*.

    The keyword is percent-encoded so that characters such as '#' and '+'
    stay part of the path ('#' would otherwise start a URL fragment and
    truncate the request).
    """
    # NOTE(review): confirm whether the site expects the keyword to be
    # double-encoded; single encoding is what plain HTTP requires.
    return SEARCH_URL_TEMPLATE % (quote(keyword, safe=''), page)


def parse_search_result(html):
    """Extract the embedded search-result JSON from *html* as a dict.

    Returns None when the marker is missing or the payload is not valid JSON.
    """
    match = _SEARCH_RESULT_RE.search(html)
    if match is None:
        return None
    try:
        return json.loads(match.group(1))  # JSON text -> dict
    except ValueError:
        return None


def first_job_url(search_result):
    """Return the detail-page URL of the first job in *search_result*.

    Returns None when *search_result* is None or contains no jobs.
    """
    jobs = (search_result or {}).get('engine_search_result') or []
    if not jobs:
        return None
    return jobs[0].get('job_href')


def print_job_detail(job_url):
    """Fetch the job detail page at *job_url* and print its main fields."""
    response = requests.get(url=job_url, headers=header, timeout=REQUEST_TIMEOUT)
    # Detail pages are decoded as GBK; undecodable bytes are replaced rather
    # than aborting the crawl.
    page = etree.HTML(response.content.decode('gbk', errors='replace'))
    if page is None:
        print('empty detail page: %s' % job_url)
        return
    company = page.xpath('//p[@class="cname"]/a/text()')
    print(page.xpath('//div[@class="tBorderTop_box"]/div[@class="tmsg inbox"]/text()'))
    print(page.xpath('//h2/span[@class="bname"]/text()'))
    print(company[0] if company else '')
    print(page.xpath('//p[@class="msg ltype"]/text()'))


def main():
    """Crawl the first job of every result page for every keyword in `skill`."""
    for keyword in skill:
        for page in range(1, PAGES_PER_SKILL + 1):
            search_url = build_search_url(keyword, page)
            try:
                response = requests.get(
                    url=search_url, headers=header, timeout=REQUEST_TIMEOUT
                )
                job_url = first_job_url(parse_search_result(response.text))
                if job_url is None:
                    print('no job found on: %s' % search_url)
                    continue
                print_job_detail(job_url)
            except requests.RequestException as exc:
                # A single failed request should not stop the whole crawl.
                print('request failed for %s: %s' % (search_url, exc))


if __name__ == '__main__':
    main()
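# Find the pattern in the search URLs.
# Pin down the URL and test-fetch it a few times.
# Write the extraction rules (re, xpath); watch for garbled Chinese text (encoding); the list data is returned as JSON.
# Anti-crawler and request-frequency issues: use proxy IPs, or slow the crawl down with time.sleep().
# Store the results in a database.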

